library(foreign)
## Load the U.S. MID data set (Stata format) with foreign::read.dta().
## NOTE(review): the path is absolute and machine-specific; the script only
## runs where this Dropbox folder exists.
mid <- read.dta("C:/Users/Eric/Dropbox/NSC Group Decisionmaking/Analysis/dafoe_caughey_replication/3-Analysis-Data/15-03-13-USMIDs1.dta")

## Indicator columns, added with base-R assignments so that no package beyond
## `foreign` is required. Later columns reuse earlier ones.
mid$EndedWithinTerm <- mid$Pres2 == ""
## Non-fishing MIDs
mid$nonfishing <- mid$FishDisp == 0 & !is.na(mid$FishDisp)
## Non-fishing MIDs where both countries were originators. Built on
## `nonfishing` so that a missing FishDisp yields FALSE rather than NA.
mid$primary <- mid$Primary == 1 & !is.na(mid$Primary) & mid$nonfishing
## Non-fishing MIDs where both countries were alone on their side
## (same NA handling as `primary`).
mid$bilateral <- mid$BothAlone == 1 & !is.na(mid$BothAlone) & mid$nonfishing
## Presidents covered by time range of MID dataset (1816-2010)
mid$in.data <- mid$TermEndDate1 > as.Date("1816-01-01") &
  mid$TermStartDate1 < as.Date("2010-06-01")
## Presidents who experienced at least one non-fishing MID
mid$non0nofish <- mid$nMIDsNoF > 0 & !is.na(mid$nMIDsNoF)
## Presidents who experienced at least one primary MID
mid$non0prim <- mid$nMIDsPrim > 0 & !is.na(mid$nMIDsPrim)
## Presidents who experienced at least one bilateral MID
mid$non0bilat <- mid$nMIDsAlone > 0 & !is.na(mid$nMIDsAlone)
## First presidential observation in each MID subset
mid$lf <- mid$LeaderOb == 1 & mid$non0nofish
mid$lp <- mid$LeaderOb == 1 & mid$non0prim
mid$lb <- mid$LeaderOb == 1 & mid$non0bilat
## Two-level label for the South1 indicator (0 = Non-Southern, else Southern)
mid$SouthFactor <- factor(ifelse(mid$South1 == 0, "Non-Southern",
                                 "Southern"))

#### RESPONSE VARIABLES
### (1) U.S. use of force in MID
  summary(mid$ForceUS)
### (2) Duration of MID (days, censored at end of originating president's term)
  ## DurationUS is a copy of DaysInTrm1; summarise it after creating it.
  mid$DurationUS <- mid$DaysInTrm1
  summary(mid$DurationUS)
### (3) Outcome
  ## Ordered factor; the labels are applied to the levels in sorted order.
  outcome.labels <- c("US Loss", "Draw", "US Win")
  mid$outcome <- factor(mid$outcome, labels = outcome.labels, ordered = TRUE)
  summary(mid$outcome)
  ## Numeric version: level codes 1-3 shifted to -1, 0, 1.
  mid$OutcomeUS <- as.numeric(mid$outcome) - 2
  table(mid$outcome, mid$OutcomeUS)

#### STRUCTURAL (PRE-PRESIDENCY) COVARIATES
  ## Base-R assignments (no dplyr/plyr needed). ongoing.mid is created first
  ## as a placeholder so that it sits before the two power indicators.
  mid$ongoing.mid <- rep(NA, nrow(mid))
  mid$great.power <- as.integer(mid$Era == "1897-1945" |
                                  mid$Era == "post-1945")
  mid$super.power <- as.integer(mid$Era == "post-1945")

### Ongoing MID
  ## A row is flagged 1 when any row belonging to the leader with the
  ## immediately preceding factor code has spillover == 1.
  ## NOTE(review): this relies on the levels of mid$Leader being in
  ## chronological order -- confirm against the .dta file.
  leader.code <- as.numeric(mid$Leader)
  ## Leader codes that have at least one spillover MID; which() drops rows
  ## with missing spillover, and missing codes can never count as a match.
  spill.codes <- unique(leader.code[which(mid$spillover == 1)])
  spill.codes <- spill.codes[!is.na(spill.codes)]
  mid$ongoing.mid <- as.integer((leader.code - 1) %in% spill.codes)

### Number of Americans killed in the last war before the president's
### (adjusted) term began:
  ## Start from the back-transformed log count, then apply the hand-coded
  ## overrides in sequence (later assignments win if rows overlap).
  dead.leader.code <- as.numeric(mid$Leader)
  war.dead <- round(exp(mid$lnprevdead), 0)
  war.dead[mid$Leader %in% c("HarrisonWH", "Polk")] <- 2260
  war.dead[mid$Leader %in% c("Taylor", "Lincoln")] <- 13283
  ## Includes Confederate dead in Civil War
  war.dead[dead.leader.code >= 17 & dead.leader.code <= 24] <- 620000
  war.dead[mid$Leader %in% "JohnsonLB"] <- 36574
  war.dead[mid$Leader %in% c("Clinton", "BushGW")] <- 382
  war.dead[mid$Leader %in% "Obama"] <- 4222
  mid$prev.war.dead <- war.dead
  mid$log.prev.war.dead <- log(war.dead)

### Years since last war
  ## Built on a local vector and stored at the end. The steps are applied in
  ## sequence and the order matters: named-leader overrides follow the range
  ## they correct, and the clamp to zero runs before the last two steps.
  war.leader.code <- as.numeric(mid$Leader)
  term.year <- mid$TermStartYear
  ## TRUE for rows whose leader code lies in [lo, hi]
  code.between <- function(lo, hi) {
    war.leader.code >= lo & war.leader.code <= hi
  }

  yrs <- rep(NA, nrow(mid))
  yrs <- ifelse(war.leader.code == 4, 0, yrs)
  ## Reference year 1815
  yrs <- ifelse(code.between(5, 11), term.year - 1815, yrs)
  yrs[mid$Leader == "HarrisonWH"] <- 1841 - 1815
  ## Reference year 1848
  yrs <- ifelse(code.between(12, 16), term.year - 1848, yrs)
  yrs[mid$Leader == "Taylor"] <- 1849 - 1848
  ## Reference year 1865
  yrs <- ifelse(code.between(17, 24), term.year - 1865, yrs)
  yrs[mid$Leader == "Garfield"] <- 1881 - 1865
  ## Reference years 1898, 1918, 1945, 1953, 1973
  yrs <- ifelse(code.between(25, 27), term.year - 1898, yrs)
  yrs <- ifelse(code.between(28, 31), term.year - 1918, yrs)
  yrs <- ifelse(war.leader.code == 32, term.year - 1945, yrs)
  yrs <- ifelse(code.between(33, 35), term.year - 1953, yrs)
  yrs <- ifelse(code.between(36, 40), term.year - 1973, yrs)
  ## Terms that began before the reference year count as zero years.
  yrs <- ifelse(yrs < 0, 0, yrs)
  ## Reference year 1991 (applied after the clamp above)
  yrs <- ifelse(code.between(41, 42), term.year - 1991, yrs)
  yrs[mid$Leader == "Obama"] <- 0

  mid$yrs.since.war <- yrs
  ## +1 so that zero years maps to log(1) = 0
  mid$log.yrs.since.war <- log(yrs + 1)
### Interaction of Log Years Since War and Log Deaths
  mid$log.dead.x.log.years <- mid$log.prev.war.dead * mid$log.yrs.since.war
### Americans killed in last war, as percentage of U.S. population
  ## U.S. population figures obtained from:
  ## http://www.census.gov/population/www/cen2000/maps/respop.html
  ## Column 1 = census year, column 2 = population.
  us.pop <- data.frame(matrix(c(2000, 281421906,
                                1990, 248709873,
                                1980, 226542199,
                                1970, 203302031,
                                1960, 179323175,
                                1950, 151325798,
                                1940, 132164569,
                                1930, 123202624,
                                1920, 106021537,
                                1910, 92228496,
                                1900, 76212168,
                                1890, 62979766,
                                1880, 50189209,
                                1870, 38558371,
                                1860, 31443321,
                                1850, 23191876,
                                1840, 17063353,
                                1830, 12860702,
                                1820, 9638453,
                                1810, 7239881,
                                1800, 5308483,
                                1790, 3929214),
                              ncol = 2, byrow = TRUE))
  ## Each recognised death toll is paired with the census year whose
  ## population serves as its denominator. Tolls not listed here (and missing
  ## tolls) get NA.
  war.census <- data.frame(
    dead = c(2260, 13283, 620000, 2446, 116516,
             405399, 36574, 58209, 382, 4222),
    census = c(1810, 1850, 1860, 1900, 1920,
               1940, 1950, 1970, 1990, 2000))
  base.year <- war.census$census[match(mid$prev.war.dead, war.census$dead)]
  base.pop <- us.pop[[2]][match(base.year, us.pop[[1]])]
  mid$prev.war.dead.pct <- (mid$prev.war.dead / base.pop) * 100
  ##
### Log Previous War Dead Percent (to deal with skew)
  mid$log.war.dead.pct <- log(mid$prev.war.dead.pct)
  ##
### Interaction of Log Previous War Dead Percent and Log Years Since
### War
  mid$log.prv.dead.pct.x.log.yrs <- mid$log.war.dead.pct * mid$log.yrs.since.war
### Ongoing War
  mid$ongoing.war <- as.integer(mid$yrs.since.war == 0 &
                                  mid$Pres1 != "JohnsonA")
  ## Note: Lee's surrender at Appomattox occurred about a
  ## week before Andrew Johnson ascended to the presidency, so he is
  ## coded as having come into office without a war ongoing.
  ## Also: Even though the War of 1812 began in his presidency, Madison
  ## is coded as mid$ongoing.war = 1 because war was ongoing at the point
  ## when the MID data begin (1816).
  ##
### Elite Veteran (imputed)
  ## Impute an elite veteran percentage of 40 (should be approximately
  ## right) for Clinton, BushGW and Obama. An earlier note named only
  ## Clinton and BushGW as having missing data; presumably the same holds
  ## for Obama -- TODO confirm.
  mid$elitevet.imput <- ifelse(mid$Leader %in% c("Clinton", "BushGW", "Obama"),
                               40, mid$elitevet)
### Lagged value of treatment (South1)
  ## For each row, take South1 from the first row of the leader whose factor
  ## code is one lower; NA when there is no such leader or the code is
  ## missing (incomparables = NA keeps missing codes from matching).
  ## NOTE(review): assumes the levels of mid$Leader are in chronological
  ## order and that South1 is numeric -- confirm.
  lag.leader.code <- as.numeric(mid$Leader)
  prev.leader.row <- match(lag.leader.code - 1, lag.leader.code,
                           incomparables = NA)
  mid$South1.prev <- mid$South1[prev.leader.row]
### Number of non-fishing MIDs in previous ten (five) years
  ## Helper: for each row of `dat`, count the MIDs flagged in `keep` that
  ## started less than `window.days` before that row's `term.start` and ended
  ## before it. Comparisons involving missing dates are ignored.
  count.mids.before.term <- function(dat, term.start, window.days, keep) {
    vapply(seq_len(nrow(dat)), function(i) {
      sum(dat$StartDateUS > (term.start[i] - window.days) &
            dat$EndDateUS < term.start[i] & keep, na.rm = TRUE)
    }, integer(1))
  }
  ## Helper: count the MIDs flagged in `keep` that started within
  ## `window.days` after the date `from`.
  count.mids.after.date <- function(dat, from, window.days, keep) {
    sum(dat$StartDateUS > from &
          dat$StartDateUS < (from + window.days) & keep, na.rm = TRUE)
  }

  ## Term start of each row's president, taken from the first row listed for
  ## that Leader (NA when Leader is missing).
  count.term.start <-
    mid$TermStartDate1[match(mid$Leader, mid$Leader, incomparables = NA)]
  count.monroe.end <- mid$TermEndDate1[mid$Leader == "Monroe"][1]
  count.early <- mid$Leader %in% c("Madison", "Monroe")
  force.used <- mid$nonfishing & mid$ForceUS == 1

  mid$nMIDs.prev10 <- count.mids.before.term(mid, count.term.start,
                                             365.25 * 10, mid$nonfishing)
  mid$nMIDs.prev5 <- count.mids.before.term(mid, count.term.start,
                                            365.25 * 5, mid$nonfishing)

  ## Replace the values for Madison and Monroe with the number of MIDs in
  ## the ten (five) years AFTER the end of Monroe's term.
  ## NOTE(review): these two windows use 365-day years while the others use
  ## 365.25; kept as-is so the stored values do not change -- confirm which
  ## is intended.
  mid$nMIDs.prev10[count.early] <-
    count.mids.after.date(mid, count.monroe.end, 365 * 10, mid$nonfishing)
  mid$nMIDs.prev5[count.early] <-
    count.mids.after.date(mid, count.monroe.end, 365 * 5, mid$nonfishing)

### Uses of Force in previous 5 and 10 years
  mid$ForceUS.prev5 <- count.mids.before.term(mid, count.term.start,
                                              365.25 * 5, force.used)
  mid$ForceUS.prev10 <- count.mids.before.term(mid, count.term.start,
                                               365.25 * 10, force.used)
  ## Replace the values for Madison and Monroe with the number of uses of
  ## force in the ten (five) years AFTER the end of Monroe's term.
  ## NOTE(review): the ten-year window uses 365-day years, the five-year
  ## window 365.25; kept as-is -- confirm which is intended.
  mid$ForceUS.prev10[count.early] <-
    count.mids.after.date(mid, count.monroe.end, 10 * 365, force.used)
  mid$ForceUS.prev5[count.early] <-
    count.mids.after.date(mid, count.monroe.end, 365.25 * 5, force.used)

### Uses of force per MID in last five and ten years
  ## Division by a zero count yields NaN (0/0) or Inf.
  mid$ForceUS.prop.prev5 <- mid$ForceUS.prev5 / mid$nMIDs.prev5
  mid$ForceUS.prop.prev10 <- mid$ForceUS.prev10 / mid$nMIDs.prev10

### Averages over recent non-fishing MIDs (log duration, duration, outcome)
  ## Helper: for each row of `dat`, average fun(values) over the non-fishing
  ## MIDs that started less than `window.days` before that row's `term.start`
  ## and ended before it. Missing values are dropped; an empty window gives
  ## NaN.
  mean.before.term <- function(dat, values, term.start, window.days,
                               fun = identity) {
    vapply(seq_len(nrow(dat)), function(i) {
      prior <- dat$StartDateUS > (term.start[i] - window.days) &
        dat$EndDateUS < term.start[i]
      mean(fun(values[prior & dat$nonfishing]), na.rm = TRUE)
    }, numeric(1))
  }
  ## Helper: average fun(values) over the non-fishing MIDs that started
  ## within `window.days` after the date `from`.
  mean.after.date <- function(dat, values, from, window.days,
                              fun = identity) {
    later <- dat$StartDateUS > from & dat$nonfishing &
      dat$StartDateUS < (from + window.days)
    mean(fun(values[later]), na.rm = TRUE)
  }

  ## Term start of each row's president, taken from the first row listed for
  ## that Leader (NA when Leader is missing).
  ave.term.start <-
    mid$TermStartDate1[match(mid$Leader, mid$Leader, incomparables = NA)]
  ave.monroe.end <- mid$TermEndDate1[mid$Leader == "Monroe"][1]
  ## Madison and Monroe get values computed from the years AFTER the end of
  ## Monroe's term instead of the years before their own terms.
  ave.early <- mid$Leader %in% c("Madison", "Monroe")
  five.years <- 5 * 365.25
  ten.years <- 10 * 365.25

### Log Duration
  mid$ave.log.duration.prev5 <-
    mean.before.term(mid, mid$DurationUS, ave.term.start, five.years, log)
  mid$ave.log.duration.prev10 <-
    mean.before.term(mid, mid$DurationUS, ave.term.start, ten.years, log)
  ## Replace the values for Madison and Monroe
  mid$ave.log.duration.prev5[ave.early] <-
    mean.after.date(mid, mid$DurationUS, ave.monroe.end, five.years, log)
  mid$ave.log.duration.prev10[ave.early] <-
    mean.after.date(mid, mid$DurationUS, ave.monroe.end, ten.years, log)

### Duration
  mid$ave.duration.prev5 <-
    mean.before.term(mid, mid$DurationUS, ave.term.start, five.years)
  mid$ave.duration.prev10 <-
    mean.before.term(mid, mid$DurationUS, ave.term.start, ten.years)
  ## Replace the values for Madison and Monroe
  mid$ave.duration.prev5[ave.early] <-
    mean.after.date(mid, mid$DurationUS, ave.monroe.end, five.years)
  mid$ave.duration.prev10[ave.early] <-
    mean.after.date(mid, mid$DurationUS, ave.monroe.end, ten.years)

### Outcome
  mid$ave.outcome.prev5 <-
    mean.before.term(mid, mid$OutcomeUS, ave.term.start, five.years)
  mid$ave.outcome.prev10 <-
    mean.before.term(mid, mid$OutcomeUS, ave.term.start, ten.years)
  ## Replace the values for Madison and Monroe
  mid$ave.outcome.prev5[ave.early] <-
    mean.after.date(mid, mid$OutcomeUS, ave.monroe.end, five.years)
  mid$ave.outcome.prev10[ave.early] <-
    mean.after.date(mid, mid$OutcomeUS, ave.monroe.end, ten.years)

### Party and military variables
  ## 0/1 dummies built with base-R assignments (no dplyr/plyr needed).
  mid$Dem <- as.integer(mid$Party1Factor == "Democratic")
  mid$DemRep <- as.integer(mid$Party1Factor == "Democratic-Republican")
  mid$Repub <- as.integer(mid$Party1Factor == "Republican")
  mid$Whig <- as.integer(mid$Party1Factor == "Whig")
  ## Democratic or Democratic-Republican
  mid$DemDR <- mid$Dem + mid$DemRep
  mid$military.occupation <- as.integer(mid$MiltOcc == "yes")
  ## NOTE(review): this is 1 when MiltExp is "no", the opposite polarity of
  ## military.occupation above. It may be an inverted test; left unchanged
  ## because downstream results may depend on it -- confirm the intended
  ## coding.
  mid$military.experience <- as.integer(mid$MiltExp == "no")

  
## Save the augmented data frame as CSV (row names are written as the first
## column, the write.csv default).
## NOTE(review): absolute, machine-specific output path.
write.csv(
  x = mid,
  file = "C:/Users/Eric/Dropbox/NSC Group Decisionmaking/Analysis/dafoe_caughey_replication/dafoe_caughey_vars.csv"
)
